{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "vertical-dining",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import geopandas as gpd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "infectious-heavy",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ICE-Spark baseline comparision\n",
    "# ST join task for Sentinel-2 and Landsat-8 datasets (for 1 month, 3 months, and 6 months)\n",
    "# the temporal delay parameter has already been set in the dataset\n",
    "# first sjoin and then temporal filtering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "korean-central",
   "metadata": {},
   "outputs": [],
   "source": [
    "# read Sentinel data\n",
    "S2_df = pd.read_csv('../../Metadata/for_PySpark/meta_s2_2018_cloud_sep.csv', sep='|')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "democratic-saint",
   "metadata": {},
   "outputs": [],
   "source": [
    "# read Landsat data\n",
    "L8_df = pd.read_csv('../../Metadata/for_PySpark/meta_l8_2018_cloud_sep.csv', sep='|')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "variable-imagination",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>system:index</th>\n",
       "      <th>cloudcover</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>.geo</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>LC08_001004_20180706</td>\n",
       "      <td>58.52</td>\n",
       "      <td>2018-07-06T14:07:29.280</td>\n",
       "      <td>{\"type\":\"LinearRing\",\"coordinates\":[[-10.92441...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>LC08_001004_20180722</td>\n",
       "      <td>85.11</td>\n",
       "      <td>2018-07-22T14:07:36.104</td>\n",
       "      <td>{\"type\":\"LinearRing\",\"coordinates\":[[-16.03892...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>LC08_001004_20180807</td>\n",
       "      <td>0.08</td>\n",
       "      <td>2018-08-07T14:07:44.903</td>\n",
       "      <td>{\"type\":\"LinearRing\",\"coordinates\":[[-12.59653...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>LC08_001004_20180908</td>\n",
       "      <td>1.85</td>\n",
       "      <td>2018-09-08T14:07:58.835</td>\n",
       "      <td>{\"type\":\"LinearRing\",\"coordinates\":[[-13.42930...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>LC08_001005_20180417</td>\n",
       "      <td>4.23</td>\n",
       "      <td>2018-04-17T14:08:04.488</td>\n",
       "      <td>{\"type\":\"LinearRing\",\"coordinates\":[[-19.20447...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>155919</th>\n",
       "      <td>155919</td>\n",
       "      <td>LC08_233248_20180512</td>\n",
       "      <td>62.10</td>\n",
       "      <td>2018-05-12T15:38:32.686</td>\n",
       "      <td>{\"type\":\"LinearRing\",\"coordinates\":[[-4.485504...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>155920</th>\n",
       "      <td>155920</td>\n",
       "      <td>LC08_233248_20180528</td>\n",
       "      <td>29.51</td>\n",
       "      <td>2018-05-28T15:38:20.079</td>\n",
       "      <td>{\"type\":\"LinearRing\",\"coordinates\":[[-9.161491...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>155921</th>\n",
       "      <td>155921</td>\n",
       "      <td>LC08_233248_20180731</td>\n",
       "      <td>55.91</td>\n",
       "      <td>2018-07-31T15:38:47.521</td>\n",
       "      <td>{\"type\":\"LinearRing\",\"coordinates\":[[-7.181117...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>155922</th>\n",
       "      <td>155922</td>\n",
       "      <td>LC08_233248_20180816</td>\n",
       "      <td>65.53</td>\n",
       "      <td>2018-08-16T15:38:56.608</td>\n",
       "      <td>{\"type\":\"LinearRing\",\"coordinates\":[[-8.378385...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>155923</th>\n",
       "      <td>155923</td>\n",
       "      <td>LC08_233248_20180901</td>\n",
       "      <td>61.15</td>\n",
       "      <td>2018-09-01T15:39:03.417</td>\n",
       "      <td>{\"type\":\"LinearRing\",\"coordinates\":[[-7.864663...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>155924 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        Unnamed: 0          system:index  cloudcover                timestamp  \\\n",
       "0                0  LC08_001004_20180706       58.52  2018-07-06T14:07:29.280   \n",
       "1                1  LC08_001004_20180722       85.11  2018-07-22T14:07:36.104   \n",
       "2                2  LC08_001004_20180807        0.08  2018-08-07T14:07:44.903   \n",
       "3                3  LC08_001004_20180908        1.85  2018-09-08T14:07:58.835   \n",
       "4                4  LC08_001005_20180417        4.23  2018-04-17T14:08:04.488   \n",
       "...            ...                   ...         ...                      ...   \n",
       "155919      155919  LC08_233248_20180512       62.10  2018-05-12T15:38:32.686   \n",
       "155920      155920  LC08_233248_20180528       29.51  2018-05-28T15:38:20.079   \n",
       "155921      155921  LC08_233248_20180731       55.91  2018-07-31T15:38:47.521   \n",
       "155922      155922  LC08_233248_20180816       65.53  2018-08-16T15:38:56.608   \n",
       "155923      155923  LC08_233248_20180901       61.15  2018-09-01T15:39:03.417   \n",
       "\n",
       "                                                     .geo  \n",
       "0       {\"type\":\"LinearRing\",\"coordinates\":[[-10.92441...  \n",
       "1       {\"type\":\"LinearRing\",\"coordinates\":[[-16.03892...  \n",
       "2       {\"type\":\"LinearRing\",\"coordinates\":[[-12.59653...  \n",
       "3       {\"type\":\"LinearRing\",\"coordinates\":[[-13.42930...  \n",
       "4       {\"type\":\"LinearRing\",\"coordinates\":[[-19.20447...  \n",
       "...                                                   ...  \n",
       "155919  {\"type\":\"LinearRing\",\"coordinates\":[[-4.485504...  \n",
       "155920  {\"type\":\"LinearRing\",\"coordinates\":[[-9.161491...  \n",
       "155921  {\"type\":\"LinearRing\",\"coordinates\":[[-7.181117...  \n",
       "155922  {\"type\":\"LinearRing\",\"coordinates\":[[-8.378385...  \n",
       "155923  {\"type\":\"LinearRing\",\"coordinates\":[[-7.864663...  \n",
       "\n",
       "[155924 rows x 5 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "L8_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "backed-attachment",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "15ca9cee-7793-4556-86c0-41a7427c126f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "def generate_WKT(GEE_coo_str):\n",
    "    if 'LinearRing' not in GEE_coo_str:\n",
    "        print('error geo type')\n",
    "        return 'unknown geo'\n",
    "    if re.search('(\\[\\[.+\\]\\])', GEE_coo_str):\n",
    "        gee_coo0 = re.search('(\\[\\[.+\\]\\])', GEE_coo_str).group(1)\n",
    "        test_com = re.compile('(\\[.+?)(\\,)(.+?\\])')\n",
    "        gee_coos_1 = test_com.sub(r'\\1 \\3', gee_coo0)\n",
    "        gee_coos_2 = gee_coos_1.replace('[', '').replace(']', '').replace(',', ', ')\n",
    "        return 'POLYGON ((' + gee_coos_2 + '))'\n",
    "    else:\n",
    "        return 'unknown format'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "1933566f-f5b5-4658-810a-9e0107c36de3",
   "metadata": {},
   "outputs": [],
   "source": [
    "L8_df['WKT_geo'] = L8_df['.geo'].map(generate_WKT)\n",
    "S2_df['WKT_geo'] = S2_df['.geo'].map(generate_WKT)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "50a9d2c6-867a-4ad9-9858-1771c26ce8fb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# S2_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "2cc934dd-65b7-49e3-ade8-ad83c514bbd2",
   "metadata": {},
   "outputs": [],
   "source": [
    "L8_gdf = gpd.GeoDataFrame({'L8_index': L8_df['system:index'], 'timestamp': pd.to_datetime(L8_df['timestamp'].map(lambda x: x.split('.')[0] if '.' in x else x), format='%Y-%m-%dT%H:%M:%S'), 'geometry': gpd.GeoSeries.from_wkt(L8_df['WKT_geo'])})\n",
    "S2_gdf = gpd.GeoDataFrame({'S2_index': S2_df['system:index'], 'timestamp': pd.to_datetime(S2_df['timestamp'].map(lambda x: x.split('.')[0] if '.' in x else x), format='%Y-%m-%dT%H:%M:%S'), 'geometry': gpd.GeoSeries.from_wkt(S2_df['WKT_geo'])})\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "76a9878a-f4e4-4f69-88b4-23717386cfaa",
   "metadata": {},
   "outputs": [],
   "source": [
    "# L8_gdf['timestamp'].map(lambda x: x.replace(tzinfo=timezone.utc))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "b0582081-e1dd-4ae8-a5b3-78281a0f801b",
   "metadata": {},
   "outputs": [],
   "source": [
    "S2_gdf['S2_timestamp_unix'] = S2_gdf['timestamp'].map(lambda x: x.timestamp())\n",
    "L8_gdf['L8_timestamp_unix'] = L8_gdf['timestamp'].map(lambda x: x.timestamp())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "clinical-least",
   "metadata": {},
   "outputs": [],
   "source": [
    "# # first test 1 month data\n",
    "\n",
    "import datetime\n",
    "import time\n",
    "from datetime import timezone\n",
    "\n",
    "num_month = 6\n",
    "# assigned regular string date\n",
    "start_date_time = datetime.datetime(2018, 1, 1, 0, 0).replace(tzinfo=timezone.utc)\n",
    "end_date_time = datetime.datetime(2018, 3, 31, 23, 59).replace(tzinfo=timezone.utc)\n",
    "\n",
    "start_date_time_unix = start_date_time.timestamp()\n",
    "end_date_time_unix = end_date_time.timestamp()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5b4d36de-a31c-406e-9366-2d7f6b8bdabe",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "dd580040-475c-407e-8389-28bb9e7a82a3",
   "metadata": {},
   "outputs": [],
   "source": [
    "S2_gdf = S2_gdf[(S2_gdf['S2_timestamp_unix'] <= end_date_time_unix) & (S2_gdf['S2_timestamp_unix'] >= start_date_time_unix)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "091b2fb2-cf8c-4204-9c90-1fa4527efb1f",
   "metadata": {},
   "outputs": [],
   "source": [
    "L8_gdf = L8_gdf[(L8_gdf['L8_timestamp_unix'] <= end_date_time_unix) & (L8_gdf['L8_timestamp_unix'] >= start_date_time_unix)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "8f37bcc2-62a5-4072-9177-6b6d76ae4d89",
   "metadata": {},
   "outputs": [],
   "source": [
    "S2_gdf = S2_gdf.rename({'timestamp': 'S2_timestamp'}, axis=1)\n",
    "L8_gdf = L8_gdf.rename({'timestamp': 'L8_timestamp'}, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0467273e-c957-4b11-bc77-e43c87debb9b",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "b5d6bbc8-9cc5-4eb1-ac38-cc5434913542",
   "metadata": {},
   "outputs": [],
   "source": [
    "# L8_gdf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "5191d907-1c51-4438-8eea-99563efb0d34",
   "metadata": {},
   "outputs": [],
   "source": [
    "# S2_gdf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "known-stock",
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "def test_pandas_sjoin(delay_length):\n",
    "    S2_gdf['S2_time_max'] = S2_gdf['S2_timestamp_unix'] + delay_length * 3600\n",
    "    S2_gdf['S2_time_min'] = S2_gdf['S2_timestamp_unix'] - delay_length * 3600\n",
    "    \n",
    "    start_spatial_time = time.time()\n",
    "    S2L8_sjoin = gpd.sjoin(S2_gdf, L8_gdf)\n",
    "    \n",
    "    S2L8_sjoin = S2L8_sjoin[(S2L8_sjoin['L8_timestamp_unix'] <= S2L8_sjoin['S2_time_max']) & (S2L8_sjoin['L8_timestamp_unix'] >= S2L8_sjoin['S2_time_min'])]\n",
    "    S2L8_sjoin[['S2_index', 'L8_index']].to_csv('S2L8_coincident_{}month.csv'.format(num_month))\n",
    "    with open('ICESpark_S2L8_baseline_{}month_all_{}_delay_geopandas.txt'.format(str(num_month), str(delay_length)), 'w') as f:\n",
    "        f.write('time for 1 month ST join using geopanas:')\n",
    "        f.write(str(time.time() - start_spatial_time))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "extreme-enzyme",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "ranking-ethiopia",
   "metadata": {},
   "outputs": [],
   "source": [
    "for delay_i in [ 6]:\n",
    "    test_pandas_sjoin(delay_i)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "novel-briefs",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6083d045-63e1-49d4-ae7f-4ba9ce174a6c",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
